library(knitr)
library(readr)
library(tidyverse)
library(ggplot2)
library(purrr)
library(broom)
library(gridExtra)
library(tidyverse)
library(broom)
library(mice)
library(GGally)
source("../R/ExtractRecordall.R")
source("../R/importance.R")
source("../R/rest.R")
# loading data
# Every player is extracted from the same two Australian Open 2017 files,
# so the paths are named once instead of being repeated in every call.
matches_file <- "../data/2017-ausopen-matches.csv"
points_file <- "../data/2017-ausopen-points.csv"
Federer <- extract_records_all(matches_file, points_file, "Roger Federer")
Nadal <- extract_records_all(matches_file, points_file, "Rafael Nadal")
Alex <- extract_records_all(matches_file, points_file, "Alexander Zverev")
Raonic <- extract_records_all(matches_file, points_file, "Milos Raonic")
Kerber <- extract_records_all(matches_file, points_file, "Angelique Kerber")
Williams_v <- extract_records_all(matches_file, points_file, "Venus Williams")
Wozniacki <- extract_records_all(matches_file, points_file,
                                 "Caroline Wozniacki")
Williams_s <- extract_records_all(matches_file, points_file, "Serena Williams")
# create a list of dataset interested
player_dt <- list(Federer, Nadal, Alex, Raonic, Kerber, Wozniacki,
                  Williams_v, Williams_s)
# create a list of player's name
# The names are paired with the datasets BY POSITION further down, so this
# list must follow exactly the same player order as `player_dt` above
# (Venus before Serena). Previously the two sisters were swapped here, which
# attached each sister's name to the other's records.
name <- list("Roger Federer", "Rafael Nadal", "Alexander Zverev",
             "Milos Raonic", "Angelique Kerber", "Caroline Wozniacki",
             "Venus Williams", "Serena Williams")
female <- list("Angelique Kerber", "Caroline Wozniacki", "Serena Williams",
               "Venus Williams")
male <- list("Roger Federer", "Rafael Nadal", "Alexander Zverev",
             "Milos Raonic")
#' Prepare one player's point records for modelling.
#'
#' Adds the player name, the `rest` factor and the point importance, replaces
#' `RallyCount` by its natural log, keeps only points with a non-zero serve
#' speed that were served by the player, and returns the modelling columns.
#'
#' @param dt Point-level records for one player. Must contain the columns
#'   used below (`RallyCount`, `Speed_KMH`, `ServeIndicator`, `player1`, ...).
#' @param name The player's full name, compared against `player1`.
#' @return `dt` reduced to the selected modelling columns.
clean_data <- function(dt, name) {
  # rest() and point_impt() are project helpers that are not defined in this
  # file (presumably they come from the scripts sourced at the top).
  enriched <- mutate(
    dt,
    name = name,
    rest = as.factor(rest(dt)),
    impt = point_impt(dt),
    RallyCount = log(RallyCount)
  )
  own_serves <- filter(
    enriched,
    # filter the double fault points
    Speed_KMH != 0,
    # keep the points served by the player of interest: serve indicator 1
    # when the player is listed as player1, otherwise 2
    ServeIndicator == ifelse(player1 == name, 1, 2)
  )
  dplyr::select(
    own_serves,
    PointNumber, impt, dist, cum_dist, rest, time, MatchNo, SetNo,
    ServeNumber, name, Speed_KMH, cum_time, RallyCount, Gender
  )
}
# clean the player_dt: run clean_data() on every (dataset, name) pair and
# stack the results into a single table
# NOTE(review): `player_dt_miss` is not defined in the visible part of this
# file - confirm that it is created (in the same player order as `name`)
# before this point.
player_dt_clean <- map2_df(.x = player_dt_miss, .y = name, .f = clean_data)
# Residuals of cumulative distance / cumulative time after removing their
# linear trend in point number (fitted over all players together).
cum_dist_resid <- augment(
  lm(cum_dist ~ PointNumber, data = player_dt_clean)
)[[".resid"]]
cum_time_resid <- augment(
  lm(cum_time ~ PointNumber, data = player_dt_clean)
)[[".resid"]]
player_dt_clean <- mutate(
  player_dt_clean,
  cum_dist_resid = cum_dist_resid,
  cum_time_resid = cum_time_resid
)
# linear model
#' Fit the serve-speed linear model to one data set.
#'
#' Regresses `Speed_KMH` on point number, point importance, time, distance,
#' rest, match number, rally count, the point-number-by-match interaction and
#' the two cumulative residual columns.
#'
#' @param data A data frame containing every variable named in the formula.
#' @return The fitted `lm` object.
fit_lm <- function(data) {
  lm(
    formula = Speed_KMH ~
      PointNumber + impt + time + dist + rest + MatchNo + RallyCount +
      PointNumber * MatchNo + cum_dist_resid + cum_time_resid,
    data = data
  )
}
#' Fit one model per player.
#'
#' Nests the rows of `dt` by `name` and applies `fit` to each player's nested
#' data frame.
#'
#' @param dt A data frame with a `name` column identifying the player.
#' @param fit A function taking one data frame and returning a fitted model.
#' @return One row per player, with the nested `data` and a `model`
#'   list-column holding the fitted model.
build_linear_model <- function(dt, fit) {
  nested_by_player <- nest(group_by(dt, name))
  mutate(nested_by_player, model = map(data, fit))
}
#' Extract the coefficient estimates of every per-player model.
#'
#' Tidies each model in the `model` list-column, keeps the player name, the
#' term and its estimate, and spreads the terms into columns.
#'
#' @param dt A data frame with a `name` column and a `model` list-column of
#'   fitted models.
#' @return One row per player and one column per model term, holding the
#'   coefficient estimates.
fetch_coef <- function(dt) {
  # The pipeline is the last expression, so its value is returned visibly.
  # Previously it was assigned to an unused local variable, which made the
  # function return its result invisibly (nothing printed when called at the
  # top level).
  dt %>%
    unnest(model %>% map(tidy)) %>%
    dplyr::select(name, term, estimate) %>%
    spread(term, estimate)
}
# split by first and second serve
player_dt_serve <- split(player_dt_clean, player_dt_clean$ServeNumber)
# Pick the groups by serve number rather than by position, so that an
# unexpected extra ServeNumber value cannot silently shift which group is
# treated as the first or the second serve.
stopifnot(all(c("1", "2") %in% names(player_dt_serve)))
firstServe <- player_dt_serve[["1"]]
SecondServe <- player_dt_serve[["2"]]
# model for first serve for all players
firstserve_lm <- build_linear_model(firstServe, fit_lm) %>%
  mutate(servenumber = as.factor(1))
firstserve_coef <- firstserve_lm %>% fetch_coef()
firstserve_aug_lm <- firstserve_lm %>% unnest(model %>% map(augment))
firstserve_fit_lm <- firstserve_lm %>% unnest(model %>% map(glance))
# model for second serve for all players
secondserve_lm <- build_linear_model(SecondServe, fit_lm) %>%
  mutate(servenumber = as.factor(2))
secondserve_coef <- fetch_coef(secondserve_lm)
secondserve_aug_lm <- secondserve_lm %>% unnest(model %>% map(augment))
secondserve_fit_lm <- secondserve_lm %>% unnest(model %>% map(glance))
# augment first and second serve data together
model_lm <- rbind(firstserve_aug_lm, secondserve_aug_lm)
# Label each coefficient table with its serve number before stacking. This
# replaces a hard-coded vector of sixteen values that was only correct for
# exactly eight players per serve.
coefficient <- rbind(
  mutate(firstserve_coef, servenumber = 1),
  mutate(secondserve_coef, servenumber = 2)
)
This graph contrasts each player’s observed serving speed with the predicted value. The blue line has a slope of 1, which is where a perfect prediction would lie. The two clusters indicate the effectiveness of the prediction when the first and second serves are modelled separately. For Federer, Nadal and Zverev, the difference between first and second serve is clear, as shown by the two main clusters, while Raonic does not seem to have a clear cut between his first- and second-serve speeds.
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | 0.1169831 | 0.2649063 |
| Angelique Kerber | 0.3439414 | 0.5843132 |
| Caroline Wozniacki | 0.4012296 | 0.2175152 |
| Milos Raonic | 0.1303058 | 0.1140851 |
| Rafael Nadal | 0.1265345 | 0.1503419 |
| Roger Federer | 0.0608728 | 0.1226519 |
| Serena Williams | 0.1235949 | 0.3698460 |
| Venus Williams | 0.1759242 | 0.3598298 |
| term | Alexander Zverev | Angelique Kerber | Caroline Wozniacki | Milos Raonic | Rafael Nadal | Roger Federer | Serena Williams | Venus Williams |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| cum_dist_resid | 0.24 | 0.10 | 0.07 | 0.83 | 0.05 | 0.88 | 0.77 | 0.16 |
| cum_time_resid | 0.20 | 0.01 | 0.00 | 0.01 | 0.07 | 0.80 | 0.19 | 0.48 |
| dist | 0.27 | 0.66 | 0.16 | 0.04 | 0.84 | 0.14 | 0.88 | 0.15 |
| impt | 0.16 | 0.07 | 0.83 | 0.71 | 0.45 | 0.00 | 0.30 | 0.83 |
| MatchNo2 | 0.61 | 0.18 | 0.01 | 0.55 | 0.09 | 0.48 | 0.81 | 0.01 |
| MatchNo3 | 0.03 | 0.55 | 0.98 | 0.85 | 0.56 | 0.36 | 0.23 | 0.02 |
| MatchNo4 | NA | 0.70 | NA | 0.30 | 0.03 | 0.65 | 0.50 | 0.01 |
| MatchNo5 | NA | NA | NA | 0.10 | 0.10 | 0.77 | 0.66 | 0.12 |
| MatchNo6 | NA | NA | NA | NA | 0.36 | 0.70 | 0.70 | 0.01 |
| MatchNo7 | NA | NA | NA | NA | 0.18 | 0.60 | 0.90 | 0.02 |
| PointNumber | 0.26 | 0.02 | 0.01 | 0.09 | 0.01 | 0.52 | 0.14 | 0.70 |
| PointNumber:MatchNo2 | 0.99 | 0.02 | 0.00 | 0.62 | 0.93 | 0.29 | 0.78 | 0.11 |
| PointNumber:MatchNo3 | 0.48 | 0.05 | 0.00 | 0.35 | 0.25 | 0.76 | 0.33 | 0.21 |
| PointNumber:MatchNo4 | NA | 0.04 | NA | 0.15 | 0.15 | 0.23 | 0.79 | 0.90 |
| PointNumber:MatchNo5 | NA | NA | NA | 0.05 | 0.04 | 0.90 | 0.98 | 0.76 |
| PointNumber:MatchNo6 | NA | NA | NA | NA | 0.03 | 0.70 | 0.46 | 0.01 |
| PointNumber:MatchNo7 | NA | NA | NA | NA | 0.08 | 0.93 | 0.91 | 0.15 |
| RallyCount | 0.00 | 0.00 | 0.01 | 0.03 | 0.00 | 0.08 | 0.99 | 0.10 |
| rest1.5 | 0.15 | 0.76 | 0.02 | 0.11 | 0.38 | 0.66 | 0.28 | 0.62 |
| rest2 | 0.25 | 0.91 | NA | 0.03 | 0.06 | 0.55 | 0.68 | 0.73 |
| time | 0.19 | 0.67 | 0.40 | 0.71 | 0.00 | 0.70 | 0.48 | 0.43 |
| term | Alexander Zverev | Angelique Kerber | Caroline Wozniacki | Milos Raonic | Rafael Nadal | Roger Federer | Serena Williams | Venus Williams |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| cum_dist_resid | 0.96 | 0.12 | 0.64 | 0.87 | 0.96 | 0.68 | 0.04 | 0.59 |
| cum_time_resid | 0.93 | 0.94 | 0.68 | 0.48 | 0.96 | 0.52 | 0.77 | 0.42 |
| dist | 0.18 | 0.02 | 0.10 | 0.63 | 0.84 | 0.21 | 0.24 | 0.19 |
| impt | 0.57 | 0.27 | 0.85 | 0.30 | 0.73 | 0.28 | 0.27 | 0.14 |
| MatchNo2 | 0.17 | 0.00 | 0.37 | 0.58 | 0.07 | 0.20 | 0.31 | 0.12 |
| MatchNo3 | 0.95 | 0.03 | 0.23 | 0.75 | 0.87 | 0.83 | 0.68 | 0.58 |
| MatchNo4 | NA | 0.90 | NA | 0.37 | 0.73 | 0.50 | 0.68 | 0.20 |
| MatchNo5 | NA | NA | NA | 0.73 | 0.93 | 0.95 | 0.04 | 0.01 |
| MatchNo6 | NA | NA | NA | NA | 0.99 | 0.64 | 0.00 | 0.75 |
| MatchNo7 | NA | NA | NA | NA | 0.80 | 0.13 | 0.06 | 0.37 |
| PointNumber | 0.56 | 0.03 | 0.79 | 0.49 | 0.84 | 0.71 | 0.08 | 0.67 |
| PointNumber:MatchNo2 | 0.17 | 0.26 | 0.84 | 0.45 | 0.44 | 0.35 | 0.65 | 0.17 |
| PointNumber:MatchNo3 | 0.88 | 0.01 | 0.93 | 0.60 | 0.76 | 0.52 | 0.07 | 0.32 |
| PointNumber:MatchNo4 | NA | 0.34 | NA | 0.42 | 0.96 | 0.79 | 0.61 | 0.33 |
| PointNumber:MatchNo5 | NA | NA | NA | 0.57 | 0.98 | 0.78 | 0.12 | 0.23 |
| PointNumber:MatchNo6 | NA | NA | NA | NA | 0.84 | 1.00 | 0.03 | 0.92 |
| PointNumber:MatchNo7 | NA | NA | NA | NA | 0.58 | 0.67 | 0.13 | 0.20 |
| RallyCount | 0.42 | 0.25 | 0.59 | 0.09 | 0.14 | 0.01 | 0.78 | 0.07 |
| rest1.5 | 0.89 | 0.66 | 0.79 | 0.82 | 0.44 | 0.81 | 0.93 | 0.32 |
| rest2 | 0.25 | NA | 0.74 | 0.97 | 0.97 | 0.94 | 0.36 | 0.98 |
| time | 0.31 | 0.97 | 0.82 | 0.50 | 0.07 | 0.50 | 0.43 | 0.46 |
The correlation between point number and cumulative match time is high, and the plot shows that the relationship is linear. Thus it does not matter much whether point number or match time is used as the x variable when plotting the models.
## [1] 0.9863859
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | 0.1219240 | -0.0599029 |
| Angelique Kerber | -0.3784410 | -0.2051566 |
| Caroline Wozniacki | -0.4652797 | 0.0893456 |
| Milos Raonic | 0.2120854 | 0.1238700 |
| Rafael Nadal | -0.1679667 | 0.0175697 |
| Roger Federer | -0.0424842 | 0.0292872 |
| Serena Williams | -0.0779580 | 0.1021910 |
| Venus Williams | -0.0713049 | 0.0707765 |
For all players, there is a significant difference between first- and second-serve speed. We can also see that, on average, the male players serve significantly faster than the female players.
For the first serve, Zverev and Raonic have serving speeds mostly above 175 KMH, which is much higher than those of Nadal or Federer. Notice that Nadal does have a few fast serves at around 200 KMH.
For the second serve, Zverev shows clear evidence of a reduction in speed in each match; this could be due to his young age and hence lack of experience, or to fatigue. For Raonic, we can find some evidence of a reduction in serving speed, but the variation of his serve speed differs a lot from match to match. For Nadal and Federer, whose serving speed is relatively consistent across games, fatigue can be captured by the variation of the serving speed.
Looking at the female players’ data, we can see that Serena and Venus Williams, who played the final, show a relatively stable serving speed, like Federer and Nadal. Kerber, in contrast, has a high variation in serving speed similar to Raonic’s, and Wozniacki’s first-serve speed seems to increase as the match proceeds.
Based on these observations, we could capture fatigue through the reduction in serving speed (slope) as well as the variation of the serving speed (variance). Attention needs to be paid to players like Raonic, whose serving speed naturally varies a lot in each match, and Wozniacki, whose first serve seems to go against our hypothesis that serving speed will decrease as the match proceeds.
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | 0.0947309 | -0.1325152 |
| Angelique Kerber | -0.0581313 | -0.1912024 |
| Caroline Wozniacki | -0.1298826 | 0.2456916 |
| Milos Raonic | -0.2356662 | -0.0928441 |
| Rafael Nadal | 0.0091780 | -0.0145209 |
| Roger Federer | -0.1165277 | -0.1356490 |
| Serena Williams | -0.0159290 | 0.1159422 |
| Venus Williams | -0.2304942 | 0.1787765 |
Running distance in general does not seem to have much effect on the male players’ serving speed, although there is a little evidence (Nadal’s second serve) that it may reduce second-serve speed.
For the female players, in contrast, running distance seems to be associated with an increase in second-serve speed. We can see that Wozniacki’s “seemingly increasing serving speed” in the previous graph is due to the increase in her second-serve speed. Serena also exhibits this pattern in the second serve.
Another thing to notice is that, due to the format of the women’s matches (best of three sets rather than best of five sets as for the men), we observe less data for the female players than for the male players; thus we would expect the female data to have higher variation (e.g. Kerber and Venus).
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | 36.794522 | -15.384869 |
| Angelique Kerber | -61.497941 | -21.759364 |
| Caroline Wozniacki | 9.605531 | -13.445716 |
| Milos Raonic | 14.244861 | -53.816542 |
| Rafael Nadal | 10.098873 | 6.369603 |
| Roger Federer | 57.017134 | -27.113367 |
| Serena Williams | 24.504677 | 31.449178 |
| Venus Williams | -7.872917 | -56.912844 |
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | 7.705043 | 0.6373962 |
| Angelique Kerber | -2.430421 | 1.8701346 |
| Caroline Wozniacki | -16.059879 | -2.9245111 |
| Milos Raonic | 10.287678 | 1.4194872 |
| Rafael Nadal | 2.117839 | 2.9106437 |
| Roger Federer | -1.506890 | -1.1257766 |
| Serena Williams | 6.707852 | -0.3783389 |
| Venus Williams | -3.473431 | 7.2900524 |
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | 14.774285 | -8.2075095 |
| Angelique Kerber | -2.162510 | NA |
| Caroline Wozniacki | NA | -3.5098499 |
| Milos Raonic | 18.376758 | -0.5150704 |
| Rafael Nadal | 7.727629 | 0.1813265 |
| Roger Federer | 3.800889 | -0.3933583 |
| Serena Williams | -5.488399 | 5.3713929 |
| Venus Williams | -2.714327 | -0.1430077 |
In general, after a game break, players tend to have a higher serving speed, which indicates less fatigue. The improvement for Nadal is marginal, while it is more obvious in Zverev’s first serve and Federer’s second serve. Raonic’s behaviour is interesting in the sense that after each game break his serving speed usually drops (green dots), while after a set break he serves faster (blue dots).
Among the female players, Kerber and Wozniacki behave similarly, with a decrease in serving speed after the scheduled break. Serena Williams, in contrast, behaves like the top male players, with a marginal increase in serving speed after the breaks.
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | -0.0778764 | 0.0415548 |
| Angelique Kerber | -0.0430758 | -0.0020048 |
| Caroline Wozniacki | 0.0601028 | 0.0271098 |
| Milos Raonic | 0.0188585 | 0.0323112 |
| Rafael Nadal | -0.0546811 | -0.0625478 |
| Roger Federer | 0.0140779 | 0.0324863 |
| Serena Williams | -0.0424071 | 0.0309649 |
| Venus Williams | 0.0499157 | -0.0343461 |
In general, as a point is played longer, the player’s serving speed decreases marginally (e.g. Nadal). However, for Raonic, a longer point seems to have a positive effect on serving speed.
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | -3.8048180 | -0.8309095 |
| Angelique Kerber | -4.6230006 | -0.9543669 |
| Caroline Wozniacki | -2.8613005 | -1.2217676 |
| Milos Raonic | -3.2077052 | -3.4552284 |
| Rafael Nadal | -1.9195151 | -1.1165007 |
| Roger Federer | -1.3863793 | -2.9550029 |
| Serena Williams | 0.0131586 | -0.2913815 |
| Venus Williams | -2.2717852 | -2.1622715 |
From the first plot, Raonic and Federer play relatively few long-rally points, while Zverev and Nadal have more long-rally points, which may help us understand whether rally count has an effect on fatigue (serving speed).
The number of rallies played in each game is also a factor that decreases the serving speed, and the effect is obvious for the first serve of Zverev, Kerber and Venus Williams. It is interesting to note that although Nadal and Serena Williams have played a relatively large number of long-rally games, this does not seem to affect their serving speed much.
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | -0.0241589 | 0.0009288 |
| Angelique Kerber | 0.0487538 | 0.0282494 |
| Caroline Wozniacki | 0.0565705 | -0.0254181 |
| Milos Raonic | -0.0063095 | -0.0079096 |
| Rafael Nadal | 0.0271139 | -0.0010073 |
| Roger Federer | 0.0023594 | 0.0077291 |
| Serena Williams | 0.0083055 | -0.0617074 |
| Venus Williams | -0.0963621 | 0.0400308 |
| name | 1 | 2 |
|---|---|---|
| Alexander Zverev | 1.0283805 | -0.0674622 |
| Angelique Kerber | -3.6751248 | 0.0641342 |
| Caroline Wozniacki | -5.3762613 | -0.8874875 |
| Milos Raonic | 1.3960296 | 0.6083765 |
| Rafael Nadal | -0.6340924 | -0.0236971 |
| Roger Federer | -0.1089691 | 0.3334647 |
| Serena Williams | -1.0976845 | -0.2455470 |
| Venus Williams | -1.1073221 | -1.1804808 |